home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Atari Mega Archive 1
/
Atari Mega Archive - Volume 1.iso
/
gnu
/
othergnu
/
ispell.zoo
/
munchlist.bug
< prev
next >
Wrap
Text File
|
1990-03-06
|
7KB
|
220 lines
#!/bin/sh
#
# Works correctly (where foo has these four words, one per line):
# ---------------
# % args "conformer" "conformers" "conformer/S" "test" | munchlist
#
# % cat foo | munchlist
#
# Doesn't work correctly:
# -----------------------
# % munchlist
# conformer
# conformers
# conformer/S
# test
# *** EOF ***
#
# % munchlist foo
#
# % munchlist <foo
#
#
#
# Here's the munchlist file, "traced":
#
# Given a list of words for ispell, generate a reduced list
# in which all possible suffixes have been collapsed. The reduced
# list will match the same set of words as the original list.
#
# Usage:
#
# munchlist [ -d hashfile ] [ -e ] [ -w chars ] [ file ] ...
#
# Options:
#
# -d hashfile
# Remove any words that are covered by 'hashfile'. The
# default is the default ispell dictionary. The words
# will be removed only if all suffixes are covered by
# the hash file. A hashfile of /dev/null should be
# specified when the main dictionary is being munched.
# -e Economical algorithm. This will use much less temporary
# disk space, at the expense of time. Useful with large files
# (such as complete dictionaries).
# -w Passed on to ispell (specify chars that are part of a word)
#
# The given input files are merged, then processed by 'ispell -c'
# to generate possible suffix lists; these are then combined
# and reduced. The final result is written to standard output.
#
# For portability to older systems, I have avoided getopt.
#
# Geoff Kuenning
# 2/28/87
#
#
# Where the helper program (icombine) and the four suffix-expansion
# sed scripts live.
#
LIBDIR=//leo/yale/ram/emacs/ispell
COMBINE=$LIBDIR/icombine
EXPAND1=$LIBDIR/isexp1.sed
EXPAND2=$LIBDIR/isexp2.sed
EXPAND3=$LIBDIR/isexp3.sed
EXPAND4=$LIBDIR/isexp4.sed
#
# Every scratch file name is $TMP followed by a short suffix; the shell's
# process id keeps concurrent runs apart.  The directory is fixed at /tmp;
# the TMPDIR-based choice is kept below, commented out.
#
# TDIR=${TMPDIR:-/usr/tmp}
TDIR=/tmp
TMP=$TDIR/munch$$
#
# Option defaults: full-speed algorithm, default dictionary, and no
# extra word characters.
#
dictopt= wchars= cheap=no
#
# Parse the leading options.  The first argument that is not one of
# -d, -e or -w ends the scan; it and everything after it stay in "$@"
# as the input file names.
#
#   -d hashfile   sets dictopt to "-d hashfile", or to the marker NONE
#                 when the hash file is /dev/null
#   -e            sets cheap=yes
#   -w chars      sets wchars to "-w chars"
#
# -d and -w take a value, so they are refused when they are the last
# argument.  Without that check the loop would shift with nothing left
# to shift (POSIX lets a non-interactive shell abort there), and dictopt
# or wchars would end up holding an option with no value after it.
#
while [ $# != 0 ]
do
    case "$1" in
    -d | -w)
        if [ $# -lt 2 ]
        then
            echo "munchlist: option $1 requires an argument" 1>&2
            echo "Usage: munchlist [ -d hashfile ] [ -e ] [ -w chars ] [ file ] ..." 1>&2
            exit 1
        fi
        ;;
    esac
    case "$1" in
    -d)
        case "$2" in
        /dev/null)
            dictopt=NONE
            ;;
        *)
            dictopt="-d $2"
            ;;
        esac
        # Drop the option letter here; the shift at the bottom of the
        # loop then drops its value.
        shift
        ;;
    -e)
        cheap=yes
        ;;
    -w)
        wchars="-w $2"
        shift
        ;;
    *)
        break
        ;;
    esac
    shift
done
#
# On hangup, interrupt or terminate (signals 1, 2 and 15), remove every
# file whose name starts with ${TMP} and exit with status 1.  The double
# quotes expand ${TMP} now, while the trap is being installed.
#
trap "/bin/rm -f ${TMP}*; exit 1" HUP INT TERM
#
# Collect all the input and expand all the suffix options (four sed's),
# and preserve (sorted) for later joining in ${TMP}a.
#
# ${1+"$@"} yields the quoted file arguments when there are any and
# nothing at all when there are none, so the first sed reads standard
# input exactly when no input files were named.
sed -f $EXPAND1 ${1+"$@"} |
    sed -f $EXPAND2 |
    sed -f $EXPAND3 |
    sed -f $EXPAND4 |
    sort -u > ${TMP}a
# Trace: a label, the contents of ${TMP}a, then a closing marker.  args
# is an external command not defined in this file.
args "" "TMPa" "--"
cat ${TMP}a
args "--"
#
# Unless an explicitly null dictionary was specified, remove all
# expanded words that are covered by the dictionary. This produces
# the final list of expanded words that this dictionary must cover.
# Leave the list in ${TMP}b.
#
case "X$dictopt" in
XNONE)
    # Explicitly null dictionary: nothing to filter out, so ${TMP}b is
    # just a second name for ${TMP}a.
    ln ${TMP}a ${TMP}b
    ;;
*)
    # $dictopt is left unquoted on purpose so that "-d hashfile" splits
    # into two words (and an empty value disappears).
    ispell -l $dictopt -p /dev/null < ${TMP}a > ${TMP}b
    ;;
esac
# Trace: a label, the contents of ${TMP}b, then a closing marker.
args "" "TMPb" "--"
cat ${TMP}b
args "--"
#
# Munch the input to generate roots and suffixes (ispell -c). We are
# only interested in words that have at least one suffix (grep -E /); the
# next step will pick up the rest. Some of the roots are illegal. We
# use join to restrict the output to those root words that are found
# in the original dictionary. In cheap mode, we re-sort this for
# icombine's benefit, and then use icombine to scrunch them together.
#
# The sort keys use the -k notation: with "/" as the field separator,
# -k1,1 is the root, -k2 is everything after the first "/", and the "f"
# modifier folds case.  They replace the obsolete "+0 -1 +1" notation,
# which POSIX no longer requires sort to accept.  For the same reason
# grep -E stands in for the obsolescent egrep.
#
# The first four stages are the same in both modes, so they are written
# once; only the tail of the pipeline depends on $cheap.
#
# Note: one disadvantage of this pipeline is that for a large file,
# the join and icombine may be sitting around for a long time while ispell
# and sorts run. You can get rid of this by splitting the pipe, at
# the expense of more temp file space.
#
ispell $wchars -c -d /dev/null -p /dev/null < ${TMP}b \
    | grep -E / \
    | sort -u -t/ -k1,1 -k2 \
    | join -t/ - ${TMP}a \
    | if [ "$cheap" = yes ]
      then
          sort -u -t/ -k1,1f -k1,1 -k2 | $COMBINE
      else
          cat
      fi > ${TMP}c
# Trace: a label, the contents of ${TMP}c, then a closing marker.  args
# is an external command not defined in this file.
args "" "TMPc" "--"; cat ${TMP}c; args "--"
#
# There is now one slight problem: the suffix flags X, J, and Z
# are simply the addition of an "S" to the suffixes N, G, and R,
# respectively. This produces redundant entries in the output file;
# for example, ABBREVIATE/N/X and ABBREVIATION/S. We must get rid
# of the unnecessary duplicates. The candidates are those words that
# have only an "S" flag (grep -E). We strip off the "S" (sed), and
# generate a list of roots that might have made these words (ispell -c).
# Of these roots, we select those that have the N, G, or R flags,
# replacing each with the plural equivalent X, J, or Z (sed -n).
# Using join once again, we select those that have legal roots
# and put them in ${TMP}d.
#
# The sort keys use the -k notation (-k1,1 is the root, -k2 the rest,
# "f" folds case) in place of the obsolete "+0 -1 +1" notation, and
# grep -E stands in for the obsolescent egrep.  The stages shared by
# both modes are written once; only the tail depends on $cheap.
#
grep -E '^[^/]*/S$' ${TMP}c \
    | sed 's@/S$@@' \
    | ispell $wchars -c -d /dev/null -p /dev/null \
    | sed -n -e '/\/N/s/N$/X/p' -e '/\/G/s/G$/J/p' -e '/\/R/s/R$/Z/p' \
    | sort -u -t/ -k1,1 -k2 \
    | join -t/ - ${TMP}a \
    | if [ "$cheap" = yes ]
      then
          sort -u -t/ -k1,1f -k1,1 -k2 | $COMBINE
      else
          cat
      fi > ${TMP}d
# Removal of ${TMP}a is disabled in this traced copy of the script.
# /bin/rm -f ${TMP}a
# Trace: a label, the contents of ${TMP}d, then a closing marker.  args
# is an external command not defined in this file.
args "" "TMPd" "--"; cat ${TMP}d; args "--"
#
# Now we have to eliminate the stuff covered by ${TMP}d from ${TMP}c.
# First, we re-expand the suffixes we just made (four sed's), and let
# ispell re-create the /S version (ispell -c). We select the /S versions
# only (grep -E), sort them (sort) for comm, and use comm to delete these
# from ${TMP}c. The output of comm (i.e., the trimmed version of
# ${TMP}c) is combined with our special-suffixes file ${TMP}d (sort again)
# and reduced in size (icombine) to produce a final list of all words
# that have at least one suffix.
#
# The sort keys use the -k notation (-k1,1 is the root, -k2 the rest,
# "f" folds case) in place of the obsolete "+0 -1 +1" notation, and
# grep -E stands in for the obsolescent egrep.
#
# The two tee stages save the stream on either side of comm in
# ${TMP}test1 and ${TMP}test2 so that it can be printed below.
#
sed -f $EXPAND1 ${TMP}d | sed -f $EXPAND2 | sed -f $EXPAND3 | sed -f $EXPAND4 \
    | ispell $wchars -c -d /dev/null -p /dev/null \
    | grep -E '/S$' \
    | sort -u -t/ -k1,1 -k2 \
    | tee ${TMP}test1 \
    | comm -13 - ${TMP}c \
    | tee ${TMP}test2 \
    | sort -u -t/ -k1,1f -k1,1 -k2 - ${TMP}d \
    | $COMBINE > ${TMP}e
# Removal of ${TMP}c and ${TMP}d is disabled in this traced copy.
# /bin/rm -f ${TMP}[cd]
# Trace: for each of the three files, a label, its contents, and a
# closing marker.  args is an external command not defined in this file.
args "" "TMPtest1" "--"; cat ${TMP}test1; args "--"
args "" "TMPtest2" "--"; cat ${TMP}test2; args "--"
args "" "TMPe" "--"; cat ${TMP}e; args "--"
#
# Now a slick trick. Use ispell to select those (root) words from the original
# list (${TMP}b) that are not covered by the suffix list (${TMP}e). Then we
# merge these with the suffix list, sort it, and use icombine to strip out
# unnecessary capitalizations and produce the final output.
#
# The sort keys use the -k notation (-k1,1f is the root with case folded,
# -k1,1 the root as written, -k2 everything after the first "/") in place
# of the obsolete "+0f -1 +0 -1 +1" notation, which POSIX no longer
# requires sort to accept.
#
ispell $wchars -d /dev/null -p ${TMP}e -l < ${TMP}b \
    | sort -t/ -k1,1f -k1,1 -k2 - ${TMP}e \
    | $COMBINE
# Removal of the scratch files is disabled in this traced copy.
# /bin/rm -f ${TMP}*